In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
In [3]:
df = pd.read_csv('F:\Documents\GitHub\Rapido_Dataset.csv')
In [4]:
df.head()
Out[4]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0
In [5]:
import pandas as pd

# Convert Unix timestamp to a readable date and time
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')

# Display the first few rows to see the conversion
print(df[['timestamp', 'date_time']].head())
      timestamp               date_time
0  1.546709e+12 2019-01-05 17:27:50.211
1  1.546709e+12 2019-01-05 17:28:29.524
2  1.546709e+12 2019-01-05 17:28:51.857
3  1.546709e+12 2019-01-05 17:29:18.403
4  1.546709e+12 2019-01-05 17:29:46.884
In [6]:
df
Out[6]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.211
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.524
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884
... ... ... ... ... ... ... ... ... ... ... ...
53421 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53422 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53423 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53424 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53425 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT

53426 rows × 11 columns

In [7]:
import pandas as pd

# Assume df is your DataFrame and you have already done the cleaning steps
# Convert Unix timestamp to a readable date and time
df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')

# Display the cleaned DataFrame to verify the new column
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare               date_time  
0  78.399056            2.806    12.609667       37.0 2019-01-05 17:27:50.211  
1  78.367294           11.991    24.075200      119.0 2019-01-05 17:28:29.524  
2  78.517921            1.322     8.708300       27.0 2019-01-05 17:28:51.857  
3  78.516586           11.822    24.037550      121.0 2019-01-05 17:29:18.403  
4  78.400032            6.978    16.120867       58.0 2019-01-05 17:29:46.884  
In [9]:
import pandas as pd

# Display all rows that contain any None (NaN) values
none_values = df[df.isnull().any(axis=1)]

# Display the rows with None values
print(none_values)
      trip_id customer_id  timestamp  pick_lat  pick_lng  drop_lat  drop_lng  \
44587     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
44588     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
44589     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
44590     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
44591     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
...       ...         ...        ...       ...       ...       ...       ...   
53421     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
53422     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
53423     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
53424     NaN         NaN        NaN       NaN       NaN       NaN       NaN   
53425     NaN         NaN        NaN       NaN       NaN       NaN       NaN   

       travel_distance  travel_time  trip_fare date_time  
44587              NaN          NaN        NaN       NaT  
44588              NaN          NaN        NaN       NaT  
44589              NaN          NaN        NaN       NaT  
44590              NaN          NaN        NaN       NaT  
44591              NaN          NaN        NaN       NaT  
...                ...          ...        ...       ...  
53421              NaN          NaN        NaN       NaT  
53422              NaN          NaN        NaN       NaT  
53423              NaN          NaN        NaN       NaT  
53424              NaN          NaN        NaN       NaT  
53425              NaN          NaN        NaN       NaT  

[8839 rows x 11 columns]
In [10]:
# Calculate the number of null values in each column
null_counts = df.isnull().sum()

# Display the number of null values for each column
print(null_counts)
trip_id            8839
customer_id        8839
timestamp          8839
pick_lat           8839
pick_lng           8839
drop_lat           8839
drop_lng           8839
travel_distance    8839
travel_time        8839
trip_fare          8839
date_time          8839
dtype: int64
In [11]:
import pandas as pd

# Convert Unix timestamp to a readable date and time by dividing by 1000
df['date_time'] = pd.to_datetime(df['timestamp'] / 1000, unit='s')

# Display the first few rows to verify the conversion
print(df[['timestamp', 'date_time']].head())
      timestamp                     date_time
0  1.546709e+12 2019-01-05 17:27:50.210999966
1  1.546709e+12 2019-01-05 17:28:29.523999929
2  1.546709e+12 2019-01-05 17:28:51.857000113
3  1.546709e+12 2019-01-05 17:29:18.403000116
4  1.546709e+12 2019-01-05 17:29:46.884000063
In [12]:
df
Out[12]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063
... ... ... ... ... ... ... ... ... ... ... ...
53421 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53422 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53423 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53424 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT
53425 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT

53426 rows × 11 columns

In [13]:
# Split the 'date_time' column into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Display the updated DataFrame to verify the new columns
print(df[['date_time', 'date', 'time']].head())
                      date_time        date             time
0 2019-01-05 17:27:50.210999966  2019-01-05  17:27:50.210999
1 2019-01-05 17:28:29.523999929  2019-01-05  17:28:29.523999
2 2019-01-05 17:28:51.857000113  2019-01-05  17:28:51.857000
3 2019-01-05 17:29:18.403000116  2019-01-05  17:29:18.403000
4 2019-01-05 17:29:46.884000063  2019-01-05  17:29:46.884000
In [14]:
# Drop all rows with null values
df = df.dropna()

# Display the updated DataFrame to verify the changes
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time        date             time  
0 2019-01-05 17:27:50.210999966  2019-01-05  17:27:50.210999  
1 2019-01-05 17:28:29.523999929  2019-01-05  17:28:29.523999  
2 2019-01-05 17:28:51.857000113  2019-01-05  17:28:51.857000  
3 2019-01-05 17:29:18.403000116  2019-01-05  17:29:18.403000  
4 2019-01-05 17:29:46.884000063  2019-01-05  17:29:46.884000  
In [15]:
df
Out[15]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time date time
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 2019-01-03 16:04:51.043999
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 2019-01-03 16:05:05.868999
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 2019-01-03 16:05:22.102999
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 2019-01-03 16:05:36.428999
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 2019-01-03 16:05:49.937999

44587 rows × 13 columns

In [16]:
# Extract the year from the 'date_time' column and create a new 'year' column
df['year'] = df['date_time'].dt.year

# Display the DataFrame to verify the new column
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time        date             time  year  
0 2019-01-05 17:27:50.210999966  2019-01-05  17:27:50.210999  2019  
1 2019-01-05 17:28:29.523999929  2019-01-05  17:28:29.523999  2019  
2 2019-01-05 17:28:51.857000113  2019-01-05  17:28:51.857000  2019  
3 2019-01-05 17:29:18.403000116  2019-01-05  17:29:18.403000  2019  
4 2019-01-05 17:29:46.884000063  2019-01-05  17:29:46.884000  2019  
In [17]:
df
Out[17]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time date time year
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 2019
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 2019
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 2019
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 2019
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000 2019
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 2019-01-03 16:04:51.043999 2019
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 2019-01-03 16:05:05.868999 2019
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 2019-01-03 16:05:22.102999 2019
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 2019-01-03 16:05:36.428999 2019
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 2019-01-03 16:05:49.937999 2019

44587 rows × 14 columns

In [18]:
# Drop the 'year' column
df = df.drop(columns=['year'])

# Display the DataFrame to verify the column is removed
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time        date             time  
0 2019-01-05 17:27:50.210999966  2019-01-05  17:27:50.210999  
1 2019-01-05 17:28:29.523999929  2019-01-05  17:28:29.523999  
2 2019-01-05 17:28:51.857000113  2019-01-05  17:28:51.857000  
3 2019-01-05 17:29:18.403000116  2019-01-05  17:29:18.403000  
4 2019-01-05 17:29:46.884000063  2019-01-05  17:29:46.884000  
In [19]:
df
Out[19]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time date time
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 2019-01-03 16:04:51.043999
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 2019-01-03 16:05:05.868999
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 2019-01-03 16:05:22.102999
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 2019-01-03 16:05:36.428999
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 2019-01-03 16:05:49.937999

44587 rows × 13 columns

In [20]:
import pandas as pd

# Extract month from 'date_time' column
df['month'] = df['date_time'].dt.month

# Create separate DataFrames for pickup and drop-off locations
pickup_data = df[['month', 'pick_lat', 'pick_lng']]
dropoff_data = df[['month', 'drop_lat', 'drop_lng']]

# Rename columns for consistency
pickup_data.columns = ['month', 'latitude', 'longitude']
dropoff_data.columns = ['month', 'latitude', 'longitude']
In [21]:
df
Out[21]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time date time month
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 1
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 1
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 1
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 1
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 2019-01-03 16:04:51.043999 1
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 2019-01-03 16:05:05.868999 1
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 2019-01-03 16:05:22.102999 1
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 2019-01-03 16:05:36.428999 1
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 2019-01-03 16:05:49.937999 1

44587 rows × 14 columns

In [22]:
import folium
from folium.plugins import HeatMap

# Function to create a heatmap for a given month
def create_heatmap(data, month, map_title):
    m = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=12)
    HeatMap(data[data['month'] == month][['latitude', 'longitude']].values, radius=10).add_to(m)
    m.save(f'{map_title}_month_{month}.html')

# Create heatmaps for each month
for month in pickup_data['month'].unique():
    create_heatmap(pickup_data, month, 'Pickup_Hotspots')
    create_heatmap(dropoff_data, month, 'Dropoff_Hotspots')
In [23]:
!pip install folium
Requirement already satisfied: folium in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (0.18.0)
Requirement already satisfied: branca>=0.6.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (0.8.0)
Requirement already satisfied: jinja2>=2.9 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (3.1.4)
Requirement already satisfied: numpy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2.1.2)
Requirement already satisfied: requests in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2.32.3)
Requirement already satisfied: xyzservices in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from folium) (2024.9.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from jinja2>=2.9->folium) (3.0.1)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from requests->folium) (2024.8.30)
In [24]:
import pandas as pd

# Assuming df is your DataFrame and it has a column 'date_time' in datetime format
# Extract the date part (without year, month, or time)
df['date_only'] = df['date_time'].dt.strftime('%d-%m')

# Display all unique dates
unique_dates = df['date_only'].unique()
print("Unique Dates:", unique_dates)
Unique Dates: ['05-01' '03-01' '02-01' '07-01' '06-01' '04-01' '01-01']
In [25]:
# Drop the 'month' column
df = df.drop(columns=['month'])

# Display the DataFrame to verify the column is removed
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time        date             time date_only  
0 2019-01-05 17:27:50.210999966  2019-01-05  17:27:50.210999     05-01  
1 2019-01-05 17:28:29.523999929  2019-01-05  17:28:29.523999     05-01  
2 2019-01-05 17:28:51.857000113  2019-01-05  17:28:51.857000     05-01  
3 2019-01-05 17:29:18.403000116  2019-01-05  17:29:18.403000     05-01  
4 2019-01-05 17:29:46.884000063  2019-01-05  17:29:46.884000     05-01  
In [26]:
df
Out[26]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time date time date_only
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 2019-01-05 17:27:50.210999 05-01
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 2019-01-05 17:28:29.523999 05-01
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 2019-01-05 17:28:51.857000 05-01
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 2019-01-05 17:29:18.403000 05-01
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 2019-01-05 17:29:46.884000 05-01
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 2019-01-03 16:04:51.043999 03-01
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 2019-01-03 16:05:05.868999 03-01
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 2019-01-03 16:05:22.102999 03-01
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 2019-01-03 16:05:36.428999 03-01
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 2019-01-03 16:05:49.937999 03-01

44587 rows × 14 columns

In [27]:
# Drop the 'date' column
df = df.drop(columns=['date'])

# Rename 'date_only' to 'date'
df = df.rename(columns={'date_only': 'date'})

# Display the DataFrame to verify the changes
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time   date  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  05-01  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  05-01  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  05-01  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  05-01  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  05-01  
In [28]:
df
Out[28]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 05-01
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 05-01
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 05-01
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 05-01
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 05-01
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 03-01
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 03-01
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 03-01
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 03-01
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 03-01

44587 rows × 13 columns

In [29]:
# Count the number of unique values in the 'trip_id' column
unique_trip_ids = df['trip_id'].nunique()

# Display the number of unique trip IDs
print(f'Number of unique trip IDs: {unique_trip_ids}')
Number of unique trip IDs: 44587
In [30]:
import pandas as pd

# Extract the date from the 'date_time' column
df['date'] = df['date_time'].dt.date

# Create separate DataFrames for pickup and drop-off locations
pickup_data = df[['date', 'pick_lat', 'pick_lng']]
dropoff_data = df[['date', 'drop_lat', 'drop_lng']]

# Rename columns for consistency
pickup_data.columns = ['date', 'latitude', 'longitude']
dropoff_data.columns = ['date', 'latitude', 'longitude']
In [31]:
df
Out[31]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03

44587 rows × 13 columns

In [32]:
import folium
from folium.plugins import HeatMap

# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
    m = folium.Map(location=[data['latitude'].mean(), data['longitude'].mean()], zoom_start=12)
    HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
    m.save(f'{map_title}_day_{day}.html')

# Create heatmaps for each date
for day in pickup_data['date'].unique():
    create_heatmap(pickup_data, day, 'Pickup_Hotspots')
    create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
In [33]:
import folium
from folium.plugins import HeatMap

# Average location to center the map around Telangana
center_lat = 17.123184
center_lng = 79.208824

# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
    m = folium.Map(location=[center_lat, center_lng], zoom_start=10)  # Adjust zoom_start for optimal view
    HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
    m.save(f'{map_title}_day_{day}.html')
    return m

# Generate and display heatmaps for each day
for day in pickup_data['date'].unique():
    pickup_map = create_heatmap(pickup_data, day, 'Pickup_Hotspots')
    dropoff_map = create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
    
    # Display the maps in Jupyter Notebook
    display(pickup_map)
    display(dropoff_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
In [34]:
import folium
from folium.plugins import HeatMap

# Average location to center the map around Telangana
center_lat = 17.123184
center_lng = 79.208824

# Function to create a heatmap for a given date
def create_heatmap(data, day, map_title):
    m = folium.Map(location=[center_lat, center_lng], zoom_start=10)  # Adjust zoom_start for optimal view
    HeatMap(data[data['date'] == day][['latitude', 'longitude']].values, radius=10).add_to(m)
    m.save(f'{map_title}_day_{day}.html')
    return m

# Generate and display heatmaps for each day
for day in pickup_data['date'].unique():
    pickup_map = create_heatmap(pickup_data, day, 'Pickup_Hotspots')
    dropoff_map = create_heatmap(dropoff_data, day, 'Dropoff_Hotspots')
    
    # Display the maps in Jupyter Notebook
    display(pickup_map)
    display(dropoff_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
Make this Notebook Trusted to load map: File -> Trust Notebook
In [35]:
import pandas as pd
import matplotlib.pyplot as plt

# Assume df is your DataFrame and contains a 'date' column and 'pick_lat' for pickup locations
# Group data by date and count the number of bookings per date
bookings_per_day = df.groupby('date')['pick_lat'].count()

# Plot the bar chart
plt.figure(figsize=(8, 4))
plt.bar(bookings_per_day.index, bookings_per_day.values, color='skyblue')
plt.xlabel('Date', fontsize=15)
plt.ylabel('Number of Bookings', fontsize=15)
plt.title('Number of Bookings by Date', fontsize=18)
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [36]:
df
Out[36]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03

44587 rows × 13 columns

In [37]:
import pandas as pd

# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour

# Group data by hour and count the number of bookings per hour
bookings_per_hour = df.groupby('hour')['pick_lat'].count()

# Plot the bar chart
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 8))
plt.bar(bookings_per_hour.index, bookings_per_hour.values, color='skyblue')
plt.xlabel('Hour', fontsize=15)
plt.ylabel('Number of Bookings', fontsize=15)
plt.title('Number of Bookings by Hour', fontsize=18)
plt.xticks(range(0, 24))
plt.show()
No description has been provided for this image
In [38]:
import pandas as pd

# Assume df is your DataFrame and contains 'date_time', 'pick_lat', 'pick_lng' columns
# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour

# Define a function to identify the top hours for each hotspot location
def get_hotspot_hours(data, location_col, lat_col, lng_col):
    # Group by location and hour, then count the number of bookings
    hotspot_hours = data.groupby([lat_col, lng_col, 'hour'])[location_col].count().reset_index()
    hotspot_hours.columns = [lat_col, lng_col, 'hour', 'bookings']
    
    # Find the hour with the maximum bookings for each location
    max_bookings_per_hour = hotspot_hours.loc[hotspot_hours.groupby([lat_col, lng_col])['bookings'].idxmax()]
    
    return max_bookings_per_hour

# Get hotspot hours for pickup locations
pickup_hotspot_hours = get_hotspot_hours(df, 'trip_id', 'pick_lat', 'pick_lng')

# Display the results
print(pickup_hotspot_hours)
        pick_lat   pick_lng  hour  bookings
0      17.330339  78.530975    14         1
1      17.334276  78.532150     8         1
2      17.334480  78.549294    14         1
3      17.334595  78.552040    13         1
4      17.334942  78.569489     6         1
...          ...        ...   ...       ...
44574  17.528427  78.388962    15         1
44575  17.528849  78.385376     6         1
44576  17.528875  78.385345     8         1
44577  17.528885  78.385391    14         1
44578  17.529791  78.434471     4         1

[44541 rows x 4 columns]
In [40]:
import pandas as pd

# Group by latitude and longitude to count the number of bookings per location
location_bookings = df.groupby(['pick_lat', 'pick_lng']).size().reset_index(name='number_of_bookings')

# Extract the hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
In [41]:
# Define a function to find the peak booking hour for each location
def peak_booking_hour(data, lat_col, lng_col, hour_col):
    # Group by location and hour, then count the number of bookings
    hourly_bookings = data.groupby([lat_col, lng_col, hour_col]).size().reset_index(name='hourly_bookings')
    
    # Find the hour with the maximum bookings for each location
    peak_hours = hourly_bookings.loc[hourly_bookings.groupby([lat_col, lng_col])['hourly_bookings'].idxmax()]
    return peak_hours

# Get peak booking hours for each location
peak_hours = peak_booking_hour(df, 'pick_lat', 'pick_lng', 'hour')

# Merge the booking counts with the peak hours
location_summary = pd.merge(location_bookings, peak_hours, on=['pick_lat', 'pick_lng'])
location_summary.columns = ['latitude', 'longitude', 'number_of_bookings', 'peak_booking_hour', 'hourly_bookings']

# Display the results
print(location_summary)
        latitude  longitude  number_of_bookings  peak_booking_hour  \
0      17.330339  78.530975                   1                 14   
1      17.334276  78.532150                   1                  8   
2      17.334480  78.549294                   1                 14   
3      17.334595  78.552040                   1                 13   
4      17.334942  78.569489                   1                  6   
...          ...        ...                 ...                ...   
44536  17.528427  78.388962                   1                 15   
44537  17.528849  78.385376                   1                  6   
44538  17.528875  78.385345                   1                  8   
44539  17.528885  78.385391                   1                 14   
44540  17.529791  78.434471                   1                  4   

       hourly_bookings  
0                    1  
1                    1  
2                    1  
3                    1  
4                    1  
...                ...  
44536                1  
44537                1  
44538                1  
44539                1  
44540                1  

[44541 rows x 5 columns]
In [42]:
import pandas as pd

# Assuming df is your DataFrame and it has 'pick_lat' and 'pick_lng' columns
# Extract unique pickup locations
unique_locations = df[['pick_lat', 'pick_lng']].drop_duplicates().reset_index(drop=True)

# Display the unique locations
print(unique_locations)
        pick_lat   pick_lng
0      17.442705  78.387878
1      17.490189  78.415512
2      17.370108  78.515045
3      17.439314  78.443001
4      17.432325  78.381966
...          ...        ...
44536  17.443661  78.391968
44537  17.439289  78.396118
44538  17.363689  78.535194
44539  17.401539  78.570076
44540  17.385243  78.479896

[44541 rows x 2 columns]
In [43]:
import pandas as pd

# Assuming df is your DataFrame and it has 'drop_lat' and 'drop_lng' columns
# Extract unique drop-off locations
unique_drop_locations = df[['drop_lat', 'drop_lng']].drop_duplicates().reset_index(drop=True)

# Display the unique drop-off locations
print(unique_drop_locations)
        drop_lat   drop_lng
0      17.457829  78.399056
1      17.450548  78.367294
2      17.377041  78.517921
3      17.397131  78.516586
4      17.401625  78.400032
...          ...        ...
44522  17.451042  78.371658
44523  17.449976  78.389160
44524  17.374418  78.529823
44525  17.416904  78.591362
44526  17.394102  78.499550

[44527 rows x 2 columns]
In [44]:
import pandas as pd

# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')

# Find the location with the maximum bookings
max_pickup_location = pickup_counts.loc[pickup_counts['number_of_bookings'].idxmax()]

print(f"Pickup location with the most bookings: {max_pickup_location}")
Pickup location with the most bookings: pick_lat              17.497129
pick_lng              78.386581
number_of_bookings     4.000000
Name: 43913, dtype: float64
In [45]:
# Display rows 43912 to 43920
print(df.iloc[43911:43920])
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
43911  ID43912   CUST_2582  1.546527e+12  17.463076  78.369011  17.445284   
43912  ID43913  CUST_12709  1.546527e+12  17.440476  78.479790  17.441366   
43913  ID43914  CUST_10049  1.546527e+12  17.406050  78.390785  17.444227   
43914  ID43915  CUST_18971  1.546527e+12  17.388184  78.478332  17.436727   
43915  ID43916   CUST_3256  1.546527e+12  17.433126  78.446663  17.402319   
43916  ID43917    CUST_279  1.546527e+12  17.404165  78.447098  17.427950   
43917  ID43918  CUST_12861  1.546527e+12  17.386606  78.480926  17.403498   
43918  ID43919    CUST_066  1.546527e+12  17.455437  78.372810  17.436823   
43919  ID43920   CUST_1689  1.546527e+12  17.511930  78.364357  17.510769   

        drop_lng  travel_distance  travel_time  trip_fare  \
43911  78.385788            2.648     9.577733       36.0   
43912  78.469864            5.445     0.100767       52.0   
43913  78.390289            5.275    16.720600       52.0   
43914  78.496376            4.042     9.092983       38.0   
43915  78.483666            3.281    10.903133       38.0   
43916  78.451126            1.191     3.845300       25.0   
43917  78.481491           17.848    50.143417      198.0   
43918  78.435692            1.894     5.148433       28.0   
43919  78.401916            0.202     1.113883       20.0   

                          date_time             time        date  hour  
43911 2019-01-03 14:43:55.934999943  14:43:55.934999  2019-01-03    14  
43912 2019-01-03 14:44:05.562999964  14:44:05.562999  2019-01-03    14  
43913 2019-01-03 14:44:10.555000067  14:44:10.555000  2019-01-03    14  
43914 2019-01-03 14:44:30.289000034  14:44:30.289000  2019-01-03    14  
43915 2019-01-03 14:44:41.865000010  14:44:41.865000  2019-01-03    14  
43916 2019-01-03 14:44:44.759000063  14:44:44.759000  2019-01-03    14  
43917 2019-01-03 14:44:52.944000006  14:44:52.944000  2019-01-03    14  
43918 2019-01-03 14:44:53.720000029  14:44:53.720000  2019-01-03    14  
43919 2019-01-03 14:44:55.555999994  14:44:55.555999  2019-01-03    14  
In [46]:
import pandas as pd

# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')

# Filter locations with more than 2 bookings
pickup_counts = pickup_counts[pickup_counts['number_of_bookings'] >= 2]

# Sort by number of bookings and select the top 10 locations
top_10_pickup_locations = pickup_counts.nlargest(10, 'number_of_bookings')

# Display the results
print(top_10_pickup_locations)
        pick_lat   pick_lng  number_of_bookings
43913  17.497129  78.386581                   4
43906  17.497105  78.386574                   3
965    17.366550  78.551582                   2
4706   17.391233  78.489227                   2
5827   17.394249  78.517014                   2
8360   17.399475  78.509415                   2
9486   17.401743  78.560585                   2
10726  17.404352  78.464462                   2
15176  17.414572  78.507652                   2
15784  17.415865  78.407753                   2
In [47]:
import pandas as pd

# Group by pickup locations and count the number of bookings per location
pickup_counts = df.groupby(['pick_lat', 'pick_lng'])['trip_id'].count().reset_index(name='number_of_bookings')

# Filter locations with more than 1 booking
locations_with_multiple_bookings = pickup_counts[pickup_counts['number_of_bookings'] > 1]

# Display the results
print(locations_with_multiple_bookings)
        pick_lat   pick_lng  number_of_bookings
965    17.366550  78.551582                   2
4706   17.391233  78.489227                   2
5827   17.394249  78.517014                   2
8360   17.399475  78.509415                   2
9486   17.401743  78.560585                   2
10726  17.404352  78.464462                   2
15176  17.414572  78.507652                   2
15784  17.415865  78.407753                   2
17255  17.420282  78.347046                   2
19031  17.425179  78.378700                   2
19106  17.425318  78.378311                   2
21470  17.430283  78.511963                   2
22111  17.431818  78.427940                   2
22970  17.433174  78.488373                   2
23674  17.434418  78.500732                   2
23719  17.434484  78.500893                   2
23993  17.434870  78.501007                   2
25291  17.436710  78.456528                   2
25694  17.437117  78.384895                   2
25741  17.437151  78.384941                   2
27512  17.439583  78.394836                   2
28302  17.440767  78.386002                   2
30122  17.442688  78.369636                   2
30179  17.442709  78.369942                   2
30427  17.442869  78.369850                   2
31907  17.444504  78.461517                   2
32386  17.445173  78.464653                   2
33339  17.446653  78.365685                   2
34849  17.448181  78.496559                   2
34875  17.448202  78.496521                   2
34903  17.448215  78.496574                   2
34911  17.448219  78.496551                   2
34976  17.448282  78.496635                   2
36236  17.450579  78.394325                   2
37098  17.452213  78.369850                   2
37650  17.453846  78.365990                   2
37675  17.453878  78.365959                   2
38026  17.454683  78.371643                   2
38736  17.456329  78.377350                   2
39158  17.457748  78.372345                   2
42264  17.480785  78.555748                   2
43906  17.497105  78.386574                   3
43913  17.497129  78.386581                   4
In [48]:
import pandas as pd

# Combine pickup and drop-off locations into a single DataFrame
combined_locations = pd.concat([
    df[['pick_lat', 'pick_lng']].rename(columns={'pick_lat': 'latitude', 'pick_lng': 'longitude'}),
    df[['drop_lat', 'drop_lng']].rename(columns={'drop_lat': 'latitude', 'drop_lng': 'longitude'})
])

# Remove duplicates to keep only unique locations
unique_locations = combined_locations.drop_duplicates()

# Count the number of bookings for each unique location
location_counts = combined_locations.groupby(['latitude', 'longitude']).size().reset_index(name='number_of_bookings')

# Filter locations with more than 2 bookings
locations_with_multiple_bookings = location_counts[location_counts['number_of_bookings'] >= 2]

# Display the results
print(locations_with_multiple_bookings)
        latitude  longitude  number_of_bookings
175    17.341450  78.529335                   2
215    17.343428  78.537392                   2
222    17.343584  78.536438                   2
232    17.343851  78.534561                   2
562    17.350534  78.511032                   2
...          ...        ...                 ...
87365  17.498640  78.390190                   2
87421  17.498835  78.385956                   2
87554  17.499443  78.383965                   2
87990  17.506767  78.411156                   2
88299  17.516806  78.338860                   2

[647 rows x 3 columns]
In [50]:
import pandas as pd

# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')

# Find the customer with the maximum bookings
most_frequent_customer = customer_bookings.loc[customer_bookings['number_of_bookings'].idxmax()]

print(f"Customer with the most bookings: {most_frequent_customer}")
Customer with the most bookings: customer_id           CUST_279
number_of_bookings          53
Name: 11208, dtype: object
In [51]:
import pandas as pd

# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')

# Sort by number of bookings and select the top 15 customers
top_15_customers = customer_bookings.nlargest(10, 'number_of_bookings')

# Display the results
print(top_15_customers)
      customer_id  number_of_bookings
11208    CUST_279                  53
11550   CUST_3100                  25
12670   CUST_4119                  25
1432    CUST_1120                  21
2730    CUST_1237                  21
10691    CUST_232                  21
11438   CUST_2999                  20
16521    CUST_762                  20
10320   CUST_1982                  19
10453   CUST_2103                  19
In [68]:
import pandas as pd

# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')

# Calculate the average number of bookings per customer
average_bookings_per_customer = customer_bookings['number_of_bookings'].mean()

print(f'Average number of bookings per customer: {average_bookings_per_customer}')
Average number of bookings per customer: 2.3296410470766498
In [52]:
import pandas as pd

# Identify the trip with the highest travel time
max_travel_time_trip = df.loc[df['travel_time'].idxmax()]

# Extract the relevant details
highest_travel_time = max_travel_time_trip['travel_time']
respective_trip_fare = max_travel_time_trip['trip_fare']

print(f'Highest travel time: {highest_travel_time}')
print(f'Respective trip fare: {respective_trip_fare}')
Highest travel time: 4134.3887
Respective trip fare: 60.0
In [53]:
df
Out[53]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date hour
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03 16
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03 16
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03 16
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03 16
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03 16

44587 rows × 14 columns

In [54]:
import pandas as pd

# Identify the trip with the highest trip fare
max_trip_fare_trip = df.loc[df['trip_fare'].idxmax()]

# Extract the relevant details
highest_trip_fare = max_trip_fare_trip['trip_fare']
respective_travel_time = max_trip_fare_trip['travel_time']

print(f'Highest trip fare: {highest_trip_fare}')
print(f'Respective travel time: {respective_travel_time}')
Highest trip fare: 1670.0
Respective travel time: 267.825483
In [55]:
import pandas as pd

# Check anomalies in the highest travel time trip
highest_travel_time_trip = df.loc[df['travel_time'].idxmax()]
print(f'Anomalies in the highest travel time trip: \n{highest_travel_time_trip}')

# Check anomalies in the highest trip fare trip
highest_trip_fare_trip = df.loc[df['trip_fare'].idxmax()]
print(f'Anomalies in the highest trip fare trip: \n{highest_trip_fare_trip}')

# Identify outliers using IQR
Q1 = df[['travel_time', 'trip_fare']].quantile(0.25)
Q3 = df[['travel_time', 'trip_fare']].quantile(0.75)
IQR = Q3 - Q1

outliers = df[((df[['travel_time', 'trip_fare']] < (Q1 - 1.5 * IQR)) |(df[['travel_time', 'trip_fare']] > (Q3 + 1.5 * IQR))).any(axis=1)]
print(f'Identified outliers: \n{outliers}')
Anomalies in the highest travel time trip: 
trip_id                                  ID10317
customer_id                            CUST_7419
timestamp                        1546583372751.0
pick_lat                               17.370081
pick_lng                               78.479561
drop_lat                               17.393438
drop_lng                               78.492004
travel_distance                            6.889
travel_time                            4134.3887
trip_fare                                   60.0
date_time          2019-01-04 06:29:32.750999928
time                             06:29:32.750999
date                                  2019-01-04
hour                                           6
Name: 10316, dtype: object
Anomalies in the highest trip fare trip: 
trip_id                                  ID42531
customer_id                           CUST_18770
timestamp                        1546519406927.0
pick_lat                               17.425972
pick_lng                               78.340454
drop_lat                               17.441185
drop_lng                               78.391289
travel_distance                              9.4
travel_time                           267.825483
trip_fare                                 1670.0
date_time          2019-01-03 12:43:26.927000046
time                             12:43:26.927000
date                                  2019-01-03
hour                                          12
Name: 42530, dtype: object
Identified outliers: 
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
1        ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
3        ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
9        ID010    CUST_010  1.546709e+12  17.401064  78.418938  17.434597   
30       ID031    CUST_031  1.546710e+12  17.447433  78.379745  17.431671   
45       ID046    CUST_046  1.546710e+12  17.406568  78.496925  17.519095   
...        ...         ...           ...        ...        ...        ...   
44519  ID44520   CUST_1955  1.546531e+12  17.439194  78.448189  17.440910   
44520  ID44521   CUST_1446  1.546531e+12  17.400417  78.418739  17.441963   
44534  ID44535   CUST_7889  1.546531e+12  17.466257  78.367310  17.500685   
44539  ID44540  CUST_19130  1.546531e+12  17.407791  78.445877  17.402725   
44580  ID44581   CUST_5830  1.546531e+12  17.402431  78.392143  17.421440   

        drop_lng  travel_distance  travel_time  trip_fare  \
1      78.367294           11.991    24.075200      119.0   
3      78.516586           11.822    24.037550      121.0   
9      78.501793           13.596    43.543183      142.0   
30     78.425896            6.553    52.538500       60.0   
45     78.396591           21.267    45.602867      243.0   
...          ...              ...          ...        ...   
44519  78.433937           11.567    28.903767      118.0   
44520  78.428490           12.127    66.452333      121.0   
44534  78.356903           18.036    43.141267      201.0   
44539  78.376396           17.200    41.441917      190.0   
44580  78.380241           17.243    39.655350      195.0   

                          date_time             time        date  hour  
1     2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
3     2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
9     2019-01-05 17:30:32.591000080  17:30:32.591000  2019-01-05    17  
30    2019-01-05 17:37:15.687999964  17:37:15.687999  2019-01-05    17  
45    2019-01-05 17:40:23.943000078  17:40:23.943000  2019-01-05    17  
...                             ...              ...         ...   ...  
44519 2019-01-03 15:56:08.453999996  15:56:08.453999  2019-01-03    15  
44520 2019-01-03 15:56:16.016000032  15:56:16.016000  2019-01-03    15  
44534 2019-01-03 15:57:50.066999911  15:57:50.066999  2019-01-03    15  
44539 2019-01-03 15:58:36.032000065  15:58:36.032000  2019-01-03    15  
44580 2019-01-03 16:04:24.164000034  16:04:24.164000  2019-01-03    16  

[3879 rows x 14 columns]
In [76]:
import pandas as pd

# Calculate Q1, Q3 and IQR
Q1 = df[['travel_time', 'trip_fare']].quantile(0.25)
Q3 = df[['travel_time', 'trip_fare']].quantile(0.75)
IQR = Q3 - Q1

# Adjust the boundaries for a more lenient outlier detection
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

# Identify outliers with the adjusted boundaries
outliers = df[((df[['travel_time', 'trip_fare']] < lower_bound) | (df[['travel_time', 'trip_fare']] > upper_bound)).any(axis=1)]

# Display the outliers
print(f'Identified outliers: \n{outliers}')
Identified outliers: 
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
9        ID010    CUST_010  1.546709e+12  17.401064  78.418938  17.434597   
45       ID046    CUST_046  1.546710e+12  17.406568  78.496925  17.519095   
69       ID070    CUST_070  1.546711e+12  17.440002  78.395165  17.388193   
89       ID090    CUST_089  1.546711e+12  17.412432  78.465210  17.490995   
93       ID094    CUST_093  1.546711e+12  17.399290  78.385506  17.444975   
...        ...         ...           ...        ...        ...        ...   
44514  ID44515   CUST_2245  1.546531e+12  17.488409  78.389084  17.502531   
44520  ID44521   CUST_1446  1.546531e+12  17.400417  78.418739  17.441963   
44534  ID44535   CUST_7889  1.546531e+12  17.466257  78.367310  17.500685   
44539  ID44540  CUST_19130  1.546531e+12  17.407791  78.445877  17.402725   
44580  ID44581   CUST_5830  1.546531e+12  17.402431  78.392143  17.421440   

        drop_lng  travel_distance  travel_time  trip_fare  \
9      78.501793           13.596    43.543183      142.0   
45     78.396591           21.267    45.602867      243.0   
69     78.494682           15.933    33.612183      176.0   
89     78.503151           13.595    29.157767      141.0   
93     78.393044            9.283    83.198267       87.0   
...          ...              ...          ...        ...   
44514  78.388596           18.626    60.479533      199.0   
44520  78.428490           12.127    66.452333      121.0   
44534  78.356903           18.036    43.141267      201.0   
44539  78.376396           17.200    41.441917      190.0   
44580  78.380241           17.243    39.655350      195.0   

                          date_time             time        date  hour  
9     2019-01-05 17:30:32.591000080  17:30:32.591000  2019-01-05    17  
45    2019-01-05 17:40:23.943000078  17:40:23.943000  2019-01-05    17  
69    2019-01-05 17:49:53.155999899  17:49:53.155999  2019-01-05    17  
89    2019-01-05 17:57:09.674999952  17:57:09.674999  2019-01-05    17  
93    2019-01-05 17:57:54.219000101  17:57:54.219000  2019-01-05    17  
...                             ...              ...         ...   ...  
44514 2019-01-03 15:55:33.858999968  15:55:33.858999  2019-01-03    15  
44520 2019-01-03 15:56:16.016000032  15:56:16.016000  2019-01-03    15  
44534 2019-01-03 15:57:50.066999911  15:57:50.066999  2019-01-03    15  
44539 2019-01-03 15:58:36.032000065  15:58:36.032000  2019-01-03    15  
44580 2019-01-03 16:04:24.164000034  16:04:24.164000  2019-01-03    16  

[1482 rows x 14 columns]
In [57]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
# Extract only the relevant columns
data = df[['travel_time', 'trip_fare']]

# Create a box plot to visualize outliers
plt.figure(figsize=(12, 6))

# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data['travel_time'], vert=False)
plt.title('Box Plot of Travel Time')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [58]:
import pandas as pd

# Remove rows with travel time > 4000
df_cleaned = df[df['travel_time'] <= 4000]

# Remove rows with trip fare > 1500
df_cleaned = df_cleaned[df_cleaned['trip_fare'] <= 1500]

# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [59]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df_cleaned is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df_cleaned[['travel_time', 'trip_fare']]

# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))

# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [60]:
import pandas as pd

# Find the row where the trip fare is the highest
highest_trip_fare_row = df.loc[df['trip_fare'].idxmax()]

# Display the row
print(highest_trip_fare_row)
trip_id                                  ID42531
customer_id                           CUST_18770
timestamp                        1546519406927.0
pick_lat                               17.425972
pick_lng                               78.340454
drop_lat                               17.441185
drop_lng                               78.391289
travel_distance                              9.4
travel_time                           267.825483
trip_fare                                 1670.0
date_time          2019-01-03 12:43:26.927000046
time                             12:43:26.927000
date                                  2019-01-03
hour                                          12
Name: 42530, dtype: object
In [61]:
df
Out[61]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date hour
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03 16
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03 16
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03 16
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03 16
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03 16

44587 rows × 14 columns

In [62]:
import pandas as pd

# Find all rows where the trip fare is greater than 1000
high_fare_rows = df[df['trip_fare'] > 1000]

# Display the rows
print(high_fare_rows)
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
42530  ID42531  CUST_18770  1.546519e+12  17.425972  78.340454  17.441185   

        drop_lng  travel_distance  travel_time  trip_fare  \
42530  78.391289              9.4   267.825483     1670.0   

                          date_time             time        date  hour  
42530 2019-01-03 12:43:26.927000046  12:43:26.927000  2019-01-03    12  
In [63]:
import pandas as pd

# Remove rows with trip fare greater than 1000
df_cleaned = df[df['trip_fare'] <= 1000]

# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [64]:
import pandas as pd

# Remove rows with travel time greater than 4000
df_cleaned = df[df['travel_time'] <= 4000]

# Display the first few rows of the cleaned DataFrame to verify
print(df_cleaned.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [65]:
import pandas as pd

# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]

# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [66]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df_cleaned is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df_cleaned[['travel_time', 'trip_fare']]

# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))

# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [67]:
import pandas as pd

# Assuming df is your DataFrame
# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]

# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [68]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_time', 'trip_fare']]

# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))

# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [70]:
import pandas as pd

# Find all rows where the travel time is greater than 4000
long_travel_time_rows = df[df['travel_time'] > 4000]

# Display the rows
print(long_travel_time_rows)
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
10316  ID10317   CUST_7419  1.546583e+12  17.370081  78.479561  17.393438   

        drop_lng  travel_distance  travel_time  trip_fare  \
10316  78.492004            6.889    4134.3887       60.0   

                          date_time             time        date  hour  
10316 2019-01-04 06:29:32.750999928  06:29:32.750999  2019-01-04     6  
In [71]:
import pandas as pd

# Assuming df is your DataFrame
# Remove rows with travel time greater than 4000
df = df[df['travel_time'] <= 4000]

# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [72]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_time', 'trip_fare']]

# Create a box plot to visualize the cleaned data
plt.figure(figsize=(12, 6))

# Travel Time Box Plot
plt.subplot(1, 2, 1)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 2, 2)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [73]:
import pandas as pd

# Assuming df is your cleaned DataFrame
# Sort the DataFrame by travel_distance in descending order and select the top 7 rows
top_7_travel_distances = df.sort_values(by='travel_distance', ascending=False).head(7)

# Display the results
print(top_7_travel_distances)
       trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
7165    ID7166   CUST_1006  1.546790e+12  17.443495  78.390411  17.444721   
18259  ID18260  CUST_11167  1.546417e+12  17.392820  78.459129  17.405748   
213      ID214    CUST_208  1.546532e+12  17.366833  78.516487  17.341431   
43368  ID43369  CUST_18921  1.546524e+12  17.445814  78.377289  17.448383   
3305    ID3306   CUST_2780  1.546741e+12  17.406488  78.495430  17.459713   
39402  ID39403  CUST_17958  1.546494e+12  17.448320  78.496551  17.448164   
15481  ID15482  CUST_10018  1.546398e+12  17.455072  78.380791  17.434895   

        drop_lng  travel_distance  travel_time  trip_fare  \
7165   78.386597           52.801    14.164500      593.0   
18259  78.443039           46.747    98.218850      521.0   
213    78.528763           40.543    83.046817      445.0   
43368  78.388031           32.523    31.401633      348.0   
3305   78.501694           32.407    57.244633      376.0   
39402  78.465332           32.123   106.770667      372.0   
15481  78.368591           30.980    74.384900      363.0   

                          date_time             time        date  hour  
7165  2019-01-06 15:53:45.615999937  15:53:45.615999  2019-01-06    15  
18259 2019-01-02 08:08:35.635999918  08:08:35.635999  2019-01-02     8  
213   2019-01-03 16:13:48.407999992  16:13:48.407999  2019-01-03    16  
43368 2019-01-03 13:52:11.894999981  13:52:11.894999  2019-01-03    13  
3305  2019-01-06 02:21:08.697000027  02:21:08.697000  2019-01-06     2  
39402 2019-01-03 05:33:04.163000107  05:33:04.163000  2019-01-03     5  
15481 2019-01-02 02:58:15.796000004  02:58:15.796000  2019-01-02     2  
In [74]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming df is your cleaned DataFrame
# Extract only the relevant columns
data_cleaned = df[['travel_distance', 'travel_time', 'trip_fare']]

# Create a box plot to visualize the cleaned data
plt.figure(figsize=(18, 6))

# Travel Distance Box Plot
plt.subplot(1, 3, 1)
plt.boxplot(data_cleaned['travel_distance'], vert=False)
plt.title('Box Plot of Travel Distance (Cleaned)')
plt.xlabel('Travel Distance')

# Travel Time Box Plot
plt.subplot(1, 3, 2)
plt.boxplot(data_cleaned['travel_time'], vert=False)
plt.title('Box Plot of Travel Time (Cleaned)')
plt.xlabel('Travel Time')

# Trip Fare Box Plot
plt.subplot(1, 3, 3)
plt.boxplot(data_cleaned['trip_fare'], vert=False)
plt.title('Box Plot of Trip Fare (Cleaned)')
plt.xlabel('Trip Fare')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [75]:
df
Out[75]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date hour
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03 16
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03 16
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03 16
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03 16
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03 16

44585 rows × 14 columns

In [76]:
import pandas as pd

# Remove rows with trip fare greater than 1500
df = df[df['trip_fare'] <= 1500]

# Remove rows with travel time greater than 4000
df = df[df['travel_time'] <= 4000]

# Display the first few rows of the cleaned DataFrame to verify
print(df.head())
  trip_id customer_id     timestamp   pick_lat   pick_lng   drop_lat  \
0   ID001    CUST_001  1.546709e+12  17.442705  78.387878  17.457829   
1   ID002    CUST_002  1.546709e+12  17.490189  78.415512  17.450548   
2   ID003    CUST_003  1.546709e+12  17.370108  78.515045  17.377041   
3   ID004    CUST_004  1.546709e+12  17.439314  78.443001  17.397131   
4   ID005    CUST_005  1.546709e+12  17.432325  78.381966  17.401625   

    drop_lng  travel_distance  travel_time  trip_fare  \
0  78.399056            2.806    12.609667       37.0   
1  78.367294           11.991    24.075200      119.0   
2  78.517921            1.322     8.708300       27.0   
3  78.516586           11.822    24.037550      121.0   
4  78.400032            6.978    16.120867       58.0   

                      date_time             time        date  hour  
0 2019-01-05 17:27:50.210999966  17:27:50.210999  2019-01-05    17  
1 2019-01-05 17:28:29.523999929  17:28:29.523999  2019-01-05    17  
2 2019-01-05 17:28:51.857000113  17:28:51.857000  2019-01-05    17  
3 2019-01-05 17:29:18.403000116  17:29:18.403000  2019-01-05    17  
4 2019-01-05 17:29:46.884000063  17:29:46.884000  2019-01-05    17  
In [77]:
df.describe()
Out[77]:
timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time hour
count 4.458500e+04 44585.000000 44585.000000 44585.000000 44585.000000 44585.000000 44585.000000 44585.000000 44585 44585.000000
mean 1.546632e+12 17.427921 78.435544 17.427891 78.434897 5.094223 15.721807 53.836447 2019-01-04 20:00:45.579670784 9.482180
min 1.546368e+12 17.330339 78.308258 12.921696 77.548103 -1.000000 0.022750 20.000000 2019-01-01 18:32:07.134999990 0.000000
25% 1.546503e+12 17.405327 78.386566 17.405661 78.385010 2.744000 8.428450 36.000000 2019-01-03 08:12:45.868000 5.000000
50% 1.546611e+12 17.432136 78.438866 17.431213 78.438164 4.299000 13.125017 46.000000 2019-01-04 14:08:08.160000 10.000000
75% 1.546772e+12 17.446777 78.480843 17.446907 78.480255 6.679000 20.110433 60.000000 2019-01-06 10:49:36.798000128 13.000000
max 1.546886e+12 17.529791 78.600647 17.736155 78.634804 52.801000 962.766250 959.000000 2019-01-07 18:29:25.111000061 23.000000
std 1.518707e+08 0.030049 0.053331 0.037722 0.054965 3.365011 12.985863 32.543213 NaN 4.708676
In [78]:
import pandas as pd

# Calculate the correlation matrix
correlation_matrix = df[['trip_fare', 'travel_distance', 'travel_time']].corr()

# Display the correlation matrix
print(correlation_matrix)
                 trip_fare  travel_distance  travel_time
trip_fare         1.000000         0.912258     0.618430
travel_distance   0.912258         1.000000     0.666326
travel_time       0.618430         0.666326     1.000000
In [79]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (1.5.2)
Requirement already satisfied: numpy>=1.19.5 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (2.1.2)
Requirement already satisfied: scipy>=1.6.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (1.14.1)
Requirement already satisfied: joblib>=1.2.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from scikit-learn) (3.5.0)
In [80]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Prepare the data
X = df[['travel_distance']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model_distance_only = LinearRegression()
model_distance_only.fit(X_train, y_train)

# Make predictions
y_pred = model_distance_only.predict(X_test)

# Evaluate the model
mse_distance_only = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance only - Mean Squared Error: {mse_distance_only}')
Model using travel distance only - Mean Squared Error: 56.29018661135825
In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model_distance_time = LinearRegression()
model_distance_time.fit(X_train, y_train)

# Make predictions
y_pred = model_distance_time.predict(X_test)

# Evaluate the model
mse_distance_time = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance and travel time - Mean Squared Error: {mse_distance_time}')
Model using travel distance and travel time - Mean Squared Error: 56.04789395759822
In [82]:
# Convert date_time column to datetime if not already done
df['date_time'] = pd.to_datetime(df['date_time'])

# Extract the hour and minute from 'date_time' column
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Prepare the data
X = df[['travel_distance', 'travel_time', 'hour', 'minute']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model_enhanced = LinearRegression()
model_enhanced.fit(X_train, y_train)

# Make predictions
y_pred = model_enhanced.predict(X_test)

# Evaluate the model
mse_enhanced = mean_squared_error(y_test, y_pred)
print(f'Model using travel distance, travel time, hour, and minute - Mean Squared Error: {mse_enhanced}')
Model using travel distance, travel time, hour, and minute - Mean Squared Error: 56.080137323933485
In [84]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Make predictions
y_pred_rf = grid_search.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Model - Mean Squared Error: {mse_rf}')

import joblib

# Assuming rf_model is your trained Random Forest model
joblib.dump(rf_model, 'rf_model.pkl')
Random Forest Model - Mean Squared Error: 32.76109310535315
Out[84]:
['rf_model.pkl']
In [85]:
import pandas as pd
import matplotlib.pyplot as plt

# Group by 'customer_id' and count the number of bookings per customer
customer_bookings = df.groupby('customer_id')['trip_id'].count().reset_index(name='number_of_bookings')

# Sort by number of bookings and select the top 5 customers
top_5_customers = customer_bookings.nlargest(5, 'number_of_bookings')

# Create a bar graph
plt.figure(figsize=(10, 6))
plt.bar(top_5_customers['customer_id'], top_5_customers['number_of_bookings'], color='skyblue')
plt.xlabel('Customer ID', fontsize=12)
plt.ylabel('Bookings', fontsize=12)
plt.title('Our Most Loyal Customers', fontsize=15)
plt.xticks(rotation=0)
plt.show()
No description has been provided for this image
In [86]:
!pip install xgboost
Requirement already satisfied: xgboost in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (2.1.1)
Requirement already satisfied: numpy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from xgboost) (2.1.2)
Requirement already satisfied: scipy in c:\users\ronit\appdata\local\programs\python\python313\lib\site-packages (from xgboost) (1.14.1)
In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Prepare the data
X = df[['travel_distance', 'travel_time', 'hour', 'minute']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'XGBoost Model - Mean Squared Error: {mse_xgb}')
XGBoost Model - Mean Squared Error: 104.66511439298114
In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
In [89]:
# Assuming df is your DataFrame
# Ensure 'date_time' is in datetime format
df['date_time'] = pd.to_datetime(df['date_time'])

# Extract hour and minute
df['hour'] = df['date_time'].dt.hour
df['minute'] = df['date_time'].dt.minute
In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Prepare the data
X = df[['travel_distance', 'travel_time']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(rf_model, 'rf_model.pkl')

# Evaluate the model
y_pred = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print(f'Random Forest Model - Mean Squared Error: {mse_rf}')
Random Forest Model - Mean Squared Error: 31.47284183021195
In [91]:
import joblib
import pandas as pd

# Load the trained model
rf_model = joblib.load('rf_model.pkl')

# Function to predict trip fare
def predict_trip_fare(travel_distance, travel_time):
    features = pd.DataFrame([[travel_distance, travel_time]], columns=['travel_distance', 'travel_time'])
    predicted_fare = rf_model.predict(features)
    return predicted_fare[0]

# Example usage
trip_distance = 10.0  # Replace with actual distance
travel_time = 20.0  # Replace with actual time

predicted_fare = predict_trip_fare(trip_distance, travel_time)
print(f'Predicted Trip Fare: {predicted_fare}')
Predicted Trip Fare: 75.63
In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# Prepare the data
X = df[['travel_distance']]
y = df['trip_fare']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(lr_model, 'lr_model.pkl')

# Evaluate the model
y_pred = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred)
print(f'Linear Regression Model - Mean Squared Error: {mse_lr}')
Linear Regression Model - Mean Squared Error: 56.29018661135825
In [93]:
import joblib
import pandas as pd

# Load the trained model
lr_model = joblib.load('lr_model.pkl')

# Function to predict trip fare
def predict_trip_fare(travel_distance):
    features = pd.DataFrame([[travel_distance]], columns=['travel_distance'])
    predicted_fare = lr_model.predict(features)
    return predicted_fare[0]

# Example usage
trip_distance = 2.806 # Replace with actual distance

predicted_fare = predict_trip_fare(trip_distance)
print(f'Predicted Trip Fare: {predicted_fare}')
Predicted Trip Fare: 33.73111737695252
In [94]:
df
Out[94]:
trip_id customer_id timestamp pick_lat pick_lng drop_lat drop_lng travel_distance travel_time trip_fare date_time time date hour minute
0 ID001 CUST_001 1.546709e+12 17.442705 78.387878 17.457829 78.399056 2.806 12.609667 37.0 2019-01-05 17:27:50.210999966 17:27:50.210999 2019-01-05 17 27
1 ID002 CUST_002 1.546709e+12 17.490189 78.415512 17.450548 78.367294 11.991 24.075200 119.0 2019-01-05 17:28:29.523999929 17:28:29.523999 2019-01-05 17 28
2 ID003 CUST_003 1.546709e+12 17.370108 78.515045 17.377041 78.517921 1.322 8.708300 27.0 2019-01-05 17:28:51.857000113 17:28:51.857000 2019-01-05 17 28
3 ID004 CUST_004 1.546709e+12 17.439314 78.443001 17.397131 78.516586 11.822 24.037550 121.0 2019-01-05 17:29:18.403000116 17:29:18.403000 2019-01-05 17 29
4 ID005 CUST_005 1.546709e+12 17.432325 78.381966 17.401625 78.400032 6.978 16.120867 58.0 2019-01-05 17:29:46.884000063 17:29:46.884000 2019-01-05 17 29
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
44582 ID44583 CUST_19137 1.546531e+12 17.443661 78.391968 17.451042 78.371658 5.665 12.573883 55.0 2019-01-03 16:04:51.043999910 16:04:51.043999 2019-01-03 16 4
44583 ID44584 CUST_19138 1.546532e+12 17.439289 78.396118 17.449976 78.389160 9.292 17.937717 76.0 2019-01-03 16:05:05.868999958 16:05:05.868999 2019-01-03 16 5
44584 ID44585 CUST_5061 1.546532e+12 17.363689 78.535194 17.374418 78.529823 5.485 17.403850 55.0 2019-01-03 16:05:22.102999926 16:05:22.102999 2019-01-03 16 5
44585 ID44586 CUST_19139 1.546532e+12 17.401539 78.570076 17.416904 78.591362 2.326 14.236767 35.0 2019-01-03 16:05:36.428999901 16:05:36.428999 2019-01-03 16 5
44586 ID44587 CUST_15562 1.546532e+12 17.385243 78.479896 17.394102 78.499550 3.204 7.085400 37.0 2019-01-03 16:05:49.937999964 16:05:49.937999 2019-01-03 16 5

44585 rows × 15 columns

In [97]:
df.to_csv('rapido_analysis.csv', index=False)
In [ ]: